#Dennis Moskov, Master Thesis
#RT for model fitness
#conversion, selectivity and yield

#using "rpart" package
#install.packages("rpart")
#library(rpart)

#packages for fancy plot
#fancy tree plot
#install.packages("rattle")
#install.packages("rpart.plot")
#library(rattle)
#library(rpart.plot)


#randomly shuffle the data
set.seed(42)                      # seed for reproducibility
DBt<-DB[sample(nrow(DB)),]

#initiate possible results
results<-rbind(c("Conversion","Selectivity","Yield"),c("X.MeOH","S.MeOH","Y.MeOH"),c(length(DBt)-2,length(DBt)-1,length(DBt)))

#loop through different outcomes
for (r in 1:3) {

#use desired outcome
useDB<-DBt[-c(1,as.numeric(results[3,-r]))]

#initiate results lists
res<-list()
res.names<-c("number","fitted","observed","residuals","residuals_squared")


#initiate matrices for regression and prediction values
reg<-matrix(, nrow = 10,ncol = 1)
colnames(reg) <-  "overall"
rownames(reg) <- c("Sample","RSS","TSS","MSS","R","R","adj.R","MSE","RMSE","SDEC")
predic<-matrix(,nrow=4,ncol=1)
colnames(predic) <-  "overall"
rownames(predic) <- c("PRESS","Q","PSE","SDEP")

#initiate list for used variables for tree construction
tvar<-vector("list", 1)

#initiate matrix for variable importance analysis
vi<-list()

    #grow tree
    form <- paste(names(useDB)[length(useDB)], "~", paste(names(useDB)[-length(useDB)], collapse=" + "))
    fit<-rpart(form, data=useDB, method="anova")
    
    #prune tree
    pfit<-prune(fit, cp=fit$cptable[which.min(fit$cptable[,"xerror"]),"CP"] )

    #predict 
    pred <- predict(pfit,useDB)

    #save results for prediction 
    res$number<-as.numeric(names(pred))
    res$fitted<-unname(pred)
    res$observed<-useDB[,length(useDB)]
    res$residuals<-res$observed-res$fitted
    res$residuals_squared<-(res$residuals)^2

    #evaluation values for model fitness for each fold
    reg["Sample",1]<-nrow(useDB)                                #number of datapoints in training set of the fold
    reg["RSS",1]<-sum(res$residuals_squared)                     #Residual Sum of Squares
    reg["TSS",1]<- sum((res$observed-mean(res$observed))^2)   #Total Sum of Squares
    reg["MSS",1]<-reg["TSS",1]-reg["RSS",1]                         #Model Sum of Squares
    reg["R",1]<-reg["MSS",1]/reg["TSS",1]                          #coefficient of determination
    reg["R",1]<-sqrt(reg["R",1])                                   #multiple correlation coefficient
    reg["adj.R",1]<- 1-((1-reg["R",1])*((reg["Sample",1]-1)/(reg["Sample",1]-(length(levels(fit$frame$var))-1))))            
 								    #Adjusted coefficient of determination 
    reg["MSE",1]<-reg["RSS",1]/(reg["Sample",1]-(length(levels(fit$frame$var))-1))                      
    								    #Mean square error
    reg["RMSE",1]<-sqrt(reg["MSE",1])                               #Root Mean square error (residual standard deviation)
    reg["SDEC",1]<-sqrt(reg["RSS",1]/reg["Sample",1])               #Standard Deviation Error in Calculation

#save used variables 
tvar<-levels(fit$frame$var)[-1]

#save variable importance 
vi<-pfit[13]

#evaluation values for goodness of prediction                                     
predic["PRESS",1]<-sum(res$residuals_squared)                            #Predictive Error Sum of Squares
predic["Q",1]<-1-(predic["PRESS",1]/reg["TSS",1])               #Cross-validated predictive R2
predic["PSE",1]<-predic["PRESS",1]/length(res$residuals_squared)         #predictive squared error
predic["SDEP",1]<-sqrt(predic["PSE",1])                                  #standard deviation error in prediction

View(reg)
View(predic)

#plot regression tree
x11()
fancyRpartPlot(pfit,main=paste("Regression Tree for Fitting of MeOH",results[1,r]),sub="")

 
#-----------SAVE-----------------------------------------------------------------------------

#save regression trees to file
pdf(paste(results[1,r]," Tree.pdf"))
fancyRpartPlot(pfit,main=paste("Regression Tree for Fitting of MeOH",results[1,r]),sub="")
dev.off()

#regression and prediction results
write.csv(reg, file =paste(results[1,r]," regression_values.csv"))
write.csv(predic, file =paste(results[1,r]," prediction_values.csv"))
write.csv(res, file =paste(results[1,r]," prediction_results.csv"), row.names=FALSE)
write.csv(tvar,file=paste(results[1,r]," Variables_Tree_Construction.csv"))
write.csv(vi, file =paste(results[1,r]," variable_importance.csv"))

#plot predicted vs. observed
png(filename=paste(results[1,r]," fittedVSobs full.png"))
par(new=TRUE, pch=16)
plot(res$fitted,res$observed,pch=16, col="blue",xlim=c(0,1),ylim=c(0,1),xaxs="i",yaxs="i",xlab=paste("Fitted MeOH",results[1,r]), ylab=paste("Observed MeOH",results[1,r] ),main=paste("Fitted vs. Observed MeOH",results[1,r]))
abline(0,1)
dev.off()

png(filename=paste(results[1,r]," fittedVSobs croped.png"))
par(new=TRUE, pch=16)
plot(res$fitted,res$observed,pch=16, col="blue",xlim=c(min(0,min(res$fitted)),max(max(res$observed),max(res$fitted))),ylim=c(min(0,min(res$fitted)),max(max(res$observed),max(res$fitted))),xaxs="i",yaxs="i",xlab=paste("Fitted MeOH",results[1,r]), ylab=paste("Observed MeOH",results[1,r] ),main=paste("Fitted vs. Observed MeOH",results[1,r]))
abline(0,1)
dev.off()

#plot predicted vs. residuals
png(filename=paste(results[1,r]," fittedVSres full.png"))
plot(res$fitted,res$residuals, col="blue",xlim=c(0,1),ylim=c(-1,1),xaxs="i",yaxs="i",xlab=paste("Fitted MeOH",results[1,r]), ylab="Residuals",main=paste("Fitted MeOH",results[1,r],"vs. Residuals"),pch=16)
abline(h=0)
dev.off()

png(filename=paste(results[1,r]," fittedVSres croped.png"))
plot(res$fitted,res$residuals, col="blue",xlim=c(min(res$fitted),max(res$fitted)), ylim=c(min(res$residuals),max(res$residuals)),xaxs="i",yaxs="i",xlab=paste("Fitted MeOH",results[1,r]), ylab="Residuals",main=paste("Fitted MeOH",results[1,r],"vs. Residuals"),pch=16)
abline(h=0)
dev.off()


#plot residual density
png(filename=paste(results[1,r]," resDensity.png"))
plot(density(res$residuals),xlab="Residuals", ylab="Density",main=paste("Density Plot of Residuals for",results[1,r]))
dev.off()

}



